###############################################################################
# 12/19/14                                                                    #
# Replication file for "Estimating the Severity of the WikiLeaks United States# 
# Diplomatic Cables Disclosure "                                              #
#                                                                             #  
# Michael Gill, Arthur Spirling                                               #
###############################################################################

#Note that this file provides code for the four estimators we use, but does 
# NOT provide data, as per the instructions of the Associate Editors of PA

rm(list=ls())

###############################################################################
#####################   MAKE THE DATA   #######################################

# Start with some simulated data representing /one/ embassy-year
# In this example, we will daily generate cables according to a 
# Poisson process, then filter the data to be like the Manning sample.

# Simulate 'true' number of cables generated per day:
cable_rate <- 3 # expected number of cables per day
days <- 1:365 # true daily numeric IDs
n_days <- length(days) # length of year
set.seed(100)
daily_counts <- rpois(n_days, cable_rate) # assign cables per day
true_N <- sum(daily_counts) # true number of cables
# Here, we see the true N is 1111
# --> this is the max of the population we want to estimate 

# Next, we will assign each cable a serial number and a daily indicator.
day_num <- unlist(lapply(days, function(x) rep(x,daily_counts[x])))
true_dat <- data.frame(serial=1:true_N, day_num)

# Now, we will filter the data to be like the Manning sample. Only a subset
# of all simulated cables with the true N as above will make it into the 
# study sample. 
set.seed(100)
manning_dat <- true_dat[which(rbinom(true_N,1,0.05)==1),]
 
# We see this leaves a study sample size of n = dim(manning_dat)[1] = 73.
# (btw, our mean embassy year in the data is 126 obs, so this isn't far off)

# Last, store sample days/daily maxima for regression estimator 
# (we return to this below)
sample_days <- sort(unique(manning_dat$day_num))
day_maxes <- unlist(lapply(sample_days,function(x) max(manning_dat$serial[which(manning_dat$day_num==x)])))

################################################################################
#################### CHECK ASSUMPTIONS ABOUT SAMPLE ARE MET ###################

#1. check that we have a random sample from a unifom (per Goodman's suggestions)
#function ks.proc checks against a uniform
# if p> 0.05, we cannot reject null of uniform ==> sample is uniform
ks.proc <- function(x=manning_dat$serial){
  ksp <- ks.test( x[-which(x==max(x))]/max(x), "punif")$p.value 
  ksp #p > 0.05 --> reject null of uniformness
}
#ks.proc(manning_dat$serial) yields p value = 0.50, so ok.
# Recall: regression estimator is much less dependent on this assumption being met

##############################################################################
######################### DO THE ESTIMATION ##################################

#2.  MLE is very simple:  max of sample.
MLE <- function(x=manning_dat$serial){max(x)}
#MLE(manning_dat$serial) yields 1108 for this example: so, close

#3. Goodman estimator is simple too.
Goodman <- function(x=manning_dat$serial){
  m <- max(x)
  k <- length(x)
  m + m/k - 1
}
#Goodman(manning_dat$serial) yields 1122.2 for this example (slightly over)

#4. Bayesian estimator w improper uniform prior (from Hohle and Held) 
#has helpful closed form for the mean
Bayes <- function(x=manning_dat$serial){
  m <- max(x)   
  k <- length(x)
  ((k-1)*(m-1)/(k-2))
}
#Bayes(manning_dat$serial) yields 1122.6 (slightly over)

#5. Regression estimator
Reg_est <- function(x=sample_days, y=day_maxes, z=n_days){
	lm_fit <- lm(y ~ x)
	as.numeric(coefficients(lm_fit)[1]+n_days*coefficients(lm_fit)[2])
}

#Reg_est(sample_days, day_maxes, 365) yields 1114.9 (slightly over)
